{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 输入语料库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['this', 'is', 'the', 'first', 'document'], ['this', 'is', 'the', 'second', 'second', 'document'], ['and', 'the', 'third', 'one'], ['is', 'this', 'the', 'first', 'document']]\n"
     ]
    }
   ],
   "source": [
    "corpus = ['this is the first document',\n",
    "        'this is the second second document',\n",
    "        'and the third one',\n",
    "        'is this the first document']\n",
    "words_list = list()\n",
    "for i in range(len(corpus)):\n",
    "    words_list.append(corpus[i].split(' '))\n",
    "print(words_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 统计词语数量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Counter({'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1}), Counter({'second': 2, 'this': 1, 'is': 1, 'the': 1, 'document': 1}), Counter({'and': 1, 'the': 1, 'third': 1, 'one': 1}), Counter({'is': 1, 'this': 1, 'the': 1, 'first': 1, 'document': 1})]\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "count_list = list()\n",
    "for i in range(len(words_list)):\n",
    "    count = Counter(words_list[i])\n",
    "    count_list.append(count)\n",
    "print(count_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. 自己动手写 tf-idf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "def tf(word, count):\n",
    "    return count[word] / sum(count.values())\n",
    "\n",
    "\n",
    "def idf(word, count_list):\n",
    "    n_contain = sum([1 for count in count_list if word in count])\n",
    "    return math.log(len(count_list) / (1 + n_contain))\n",
    "\n",
    "\n",
    "def tf_idf(word, count, count_list):\n",
    "    return tf(word, count) * idf(word, count_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 输出结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第 1 个文档 TF-IDF 统计信息\n",
      "\tword: first, TF-IDF: 0.05754\n",
      "\tword: this, TF-IDF: 0.0\n",
      "\tword: is, TF-IDF: 0.0\n",
      "\tword: document, TF-IDF: 0.0\n",
      "\tword: the, TF-IDF: -0.04463\n",
      "第 2 个文档 TF-IDF 统计信息\n",
      "\tword: second, TF-IDF: 0.23105\n",
      "\tword: this, TF-IDF: 0.0\n",
      "\tword: is, TF-IDF: 0.0\n",
      "\tword: document, TF-IDF: 0.0\n",
      "\tword: the, TF-IDF: -0.03719\n",
      "第 3 个文档 TF-IDF 统计信息\n",
      "\tword: and, TF-IDF: 0.17329\n",
      "\tword: third, TF-IDF: 0.17329\n",
      "\tword: one, TF-IDF: 0.17329\n",
      "\tword: the, TF-IDF: -0.05579\n",
      "第 4 个文档 TF-IDF 统计信息\n",
      "\tword: first, TF-IDF: 0.05754\n",
      "\tword: is, TF-IDF: 0.0\n",
      "\tword: this, TF-IDF: 0.0\n",
      "\tword: document, TF-IDF: 0.0\n",
      "\tword: the, TF-IDF: -0.04463\n"
     ]
    }
   ],
   "source": [
    "for i, count in enumerate(count_list):\n",
    "    print(\"第 {} 个文档 TF-IDF 统计信息\".format(i + 1))\n",
    "    scores = {word : tf_idf(word, count, count_list) for word in count}\n",
    "    sorted_word = sorted(scores.items(), key = lambda x : x[1], reverse=True)\n",
    "    for word, score in sorted_word:\n",
    "        print(\"\\tword: {}, TF-IDF: {}\".format(word, round(score, 5)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. 使用 gensim 提取文本的tfidf特征"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 得到每个词的id值及词频"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (2, 1), (3, 1), (4, 1), (5, 2)], [(3, 1), (6, 1), (7, 1), (8, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]\n"
     ]
    }
   ],
   "source": [
    "from gensim import corpora\n",
    "# 赋给语料库中每个词(不重复的词)一个整数id\n",
    "dic = corpora.Dictionary(words_list)\n",
    "new_corpus = [dic.doc2bow(words) for words in words_list]\n",
    "# 元组中第一个元素是词语在词典中对应的id，第二个元素是词语在文档中出现的次数\n",
    "print(new_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'document': 0, 'first': 1, 'is': 2, 'the': 3, 'this': 4, 'second': 5, 'and': 6, 'one': 7, 'third': 8}\n"
     ]
    }
   ],
   "source": [
    "# 通过下面的方法可以看到语料库中每个词对应的id\n",
    "print(dic.token2id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练gensim模型并且保存它以便后面的使用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)], [(0, 0.10212329019650272), (2, 0.10212329019650272), (4, 0.10212329019650272), (5, 0.9842319344536239)], [(6, 0.5773502691896258), (7, 0.5773502691896258), (8, 0.5773502691896258)], [(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)]]\n"
     ]
    }
   ],
   "source": [
    "# 训练模型并保存\n",
    "from gensim import models\n",
    "tfidf = models.TfidfModel(new_corpus)\n",
    "tfidf.save(\"tfidf.model\")\n",
    "# 载入模型\n",
    "tfidf = models.TfidfModel.load(\"tfidf.model\")\n",
    "# 使用这个训练好的模型得到单词的tfidf值\n",
    "tfidf_vec = []\n",
    "for i in range(len(corpus)):\n",
    "    string = corpus[i]\n",
    "    string_bow = dic.doc2bow(string.lower().split())\n",
    "    string_tfidf = tfidf[string_bow]\n",
    "    tfidf_vec.append(string_tfidf)\n",
    "# 输出 词语id与词语tfidf值\n",
    "print(tfidf_vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)]\n"
     ]
    }
   ],
   "source": [
    "# 测试一个句子\n",
    "test_words = \"i is the first one\"\n",
    "string_bow = dic.doc2bow(string.lower().split())\n",
    "string_tfidf = tfidf[string_bow]\n",
    "print(string_tfidf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. 使用sklearn提取文本tfidf特征"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 调用函数包 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']\n",
      "{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}\n",
      "[[0.         0.43877674 0.54197657 0.43877674 0.         0.\n",
      "  0.35872874 0.         0.43877674]\n",
      " [0.         0.27230147 0.         0.27230147 0.         0.85322574\n",
      "  0.22262429 0.         0.27230147]\n",
      " [0.55280532 0.         0.         0.         0.55280532 0.\n",
      "  0.28847675 0.55280532 0.        ]\n",
      " [0.         0.43877674 0.54197657 0.43877674 0.         0.\n",
      "  0.35872874 0.         0.43877674]]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "tfidf_vec = TfidfVectorizer()\n",
    "tfidf_matrix = tfidf_vec.fit_transform(corpus)\n",
    "# 得到语料库所有不重复的词\n",
    "print(tfidf_vec.get_feature_names())\n",
    "# 得到每个单词对应的id值\n",
    "print(tfidf_vec.vocabulary_)\n",
    "# 得到每个句子所对应的向量，向量里数字的顺序是按照词语的id顺序来的\n",
    "print(tfidf_matrix.toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
